Model-3 Bidirectional LSTM model using embedding layer
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score,f1_score
import time
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
# Read the CSV file into a pandas DataFrame.
# NOTE(review): path is Colab/Google-Drive specific — update when running elsewhere.
df = pd.read_csv('/content/drive/MyDrive/preprocessed.csv')
print(df.head())
Tweet Sentiment \
0 @_angelica_toy Happy Anniversary!!!....The Day... 1
1 @McfarlaneGlenda Happy Anniversary!!!....The D... 1
2 @thevivafrei @JustinTrudeau Happy Anniversary!... 1
3 @NChartierET Happy Anniversary!!!....The Day t... 1
4 @tabithapeters05 Happy Anniversary!!!....The D... 1
stemmed_content
0 angelica toy happi anniversari day freedumb di...
1 mcfarlaneglenda happi anniversari day freedumb...
2 thevivafrei justintrudeau happi anniversari da...
3 nchartieret happi anniversari day freedumb die...
4 tabithapet happi anniversari day freedumb die ...
# Drop rows with NaN values in the 'Tweet' column (in place; no reassignment needed).
df.dropna(subset=['Tweet'], inplace=True)
# Inspect the class balance of the numeric sentiment codes (0-4).
df.Sentiment.value_counts()
4 233700 2 77015 1 64004 3 42556 0 34056 Name: Sentiment, dtype: int64
# Translate the numeric sentiment codes (0-4) from the CSV into readable
# class names, replacing the 'Sentiment' column in place.
codes = [4, 3, 2, 1, 0]
labels = ['Strong_Pos', 'Strong_Neg', 'Neutral', 'Mild_Pos', 'Mild_Neg']
sentiment_mapping = dict(zip(codes, labels))
df['Sentiment'] = df['Sentiment'].map(sentiment_mapping)
# Count the occurrences of each unique sentiment label.
sentiment_counts = df['Sentiment'].value_counts()
# Plotting
plt.figure(figsize=(10, 6))
# Plotting the bar chart with a different color (green)
sentiment_counts.plot(kind='bar', color='green')
# Adding title and labels
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Count')
# Display the plot
plt.show()
# Re-check the class balance after relabeling (same counts, new names).
df.Sentiment.value_counts()
Strong_Pos 233700 Neutral 77015 Mild_Pos 64004 Strong_Neg 42556 Mild_Neg 34056 Name: Sentiment, dtype: int64
# Define input and output columns.
X = df['Tweet']        # raw tweet text
y = df['Sentiment']    # string class labels (Strong_Pos ... Mild_Neg)
# Tokenize text: fit the vocabulary on the full corpus, then encode each
# tweet as a sequence of word indices.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_sequences = tokenizer.texts_to_sequences(X)
# Pad every sequence (Keras default: leading zeros) to the length of the
# longest tokenized tweet in the corpus.
max_sequence_length = max([len(seq) for seq in X_sequences])
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length)
# NOTE(review): this string->int encoding does NOT agree with the CSV's
# original numeric codes mapped earlier (there, 3 was Strong_Neg and
# 2 was Neutral). All class indices in the reports below follow THIS
# dictionary — confirm that is intended.
sentiment_mapping = {'Strong_Pos' : 4 , 'Neutral' : 3 ,'Mild_Pos': 2 ,'Strong_Neg' : 1 ,'Mild_Neg': 0 }
y_categorical = [sentiment_mapping[sentiment] for sentiment in y]
y_categorical = to_categorical(y_categorical)
# Train-test split: hold out 20%, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_categorical, test_size=0.2, random_state=42)
Bidirectional LSTM model using embedding layer
Model Configuration and Training
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
# Define vocabulary size (number of unique words).
num_words = len(tokenizer.word_index) + 1 # Add 1 for the padding token
# Define embedding dimension (size of each learned word vector).
embedding_dim = 50
# NOTE(review): hard-codes 83, overwriting the corpus maximum computed
# earlier — keep in sync with the padded width of X_padded (83 here),
# or reuse the computed value instead of a literal.
max_sequence_length = 83
# Define number of classes (5 sentiment categories).
num_classes = len(sentiment_mapping)
# Model: learned embeddings -> stacked bidirectional LSTMs with dropout
# -> softmax over the 5 classes.
model = Sequential()
model.add(Embedding(input_dim=num_words, output_dim=embedding_dim))
model.add(Bidirectional(LSTM(units=128, return_sequences=True)))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(units=64)))
model.add(Dropout(0.5))
model.add(Dense(units=num_classes, activation='softmax'))
# Compile the model (categorical cross-entropy matches the one-hot labels).
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Train for 2 epochs, timing the run; 20% of the training split is used
# for epoch-level validation.
start_time = time.time()
history = model.fit(X_train, y_train, epochs=2, batch_size=128, validation_split=0.2)
end_time = time.time()
train_time = end_time - start_time
print("Training Time:", train_time, "seconds")
Epoch 1/2 2257/2257 [==============================] - 3788s 2s/step - loss: 0.7096 - accuracy: 0.7412 - val_loss: 0.5059 - val_accuracy: 0.8174 Epoch 2/2 2257/2257 [==============================] - 3584s 2s/step - loss: 0.4230 - accuracy: 0.8468 - val_loss: 0.4981 - val_accuracy: 0.8225 Training Time: 7410.188728094101 seconds
Training Data Check
# Generate predictions for training data (per-class probabilities).
y_train_pred = model.predict(X_train)
y_train_pred_classes = np.argmax(y_train_pred, axis=1)
# One-vs-rest AUC on two one-hot columns.
# NOTE(review): under the 0-4 encoding used for training, column 1 is
# Strong_Neg and column 0 is Mild_Neg — the "Positive"/"Negative" print
# labels below look inaccurate; confirm which classes were intended.
auc_positive = roc_auc_score(y_train[:, 1], y_train_pred[:, 1])
auc_negative = roc_auc_score(y_train[:, 0], y_train_pred[:, 0])
print("AUC for Positive Class:", auc_positive)
print("AUC for Negative Class:", auc_negative)
# Construct confusion matrix for training data (rows: true, cols: predicted).
cm_train = confusion_matrix(np.argmax(y_train, axis=1), y_train_pred_classes)
print("Confusion Matrix for Training Data:")
print(cm_train)
# Classification Report for training data (per-class precision/recall/F1).
print("Classification Report for Training Data:")
print(classification_report(np.argmax(y_train, axis=1), y_train_pred_classes))
# Per-class F1: average=None returns one score per class, indexed by class.
# (Same class-index caveat as the AUC prints above.)
f1_score_positive = f1_score(np.argmax(y_train, axis=1), y_train_pred_classes, average=None)[1]
f1_score_negative = f1_score(np.argmax(y_train, axis=1), y_train_pred_classes, average=None)[0]
print("F1 Score for Positive:", f1_score_positive)
print("F1 Score for Negative:", f1_score_negative)
# Overall training-set accuracy.
accuracy = accuracy_score(np.argmax(y_train, axis=1), y_train_pred_classes)
print("Accuracy:", accuracy)
11284/11284 [==============================] - 1296s 115ms/step
AUC for Positive Class: 0.9900555967282751
AUC for Negative Class: 0.9720893999350884
Confusion Matrix for Training Data:
[[ 17261 5036 781 3392 769]
[ 1908 30444 254 914 552]
[ 482 472 37956 4800 7444]
[ 3120 1486 3503 50764 2784]
[ 217 384 3550 1183 181608]]
Classification Report for Training Data:
precision recall f1-score support
0 0.75 0.63 0.69 27239
1 0.80 0.89 0.85 34072
2 0.82 0.74 0.78 51154
3 0.83 0.82 0.83 61657
4 0.94 0.97 0.96 186942
accuracy 0.88 361064
macro avg 0.83 0.81 0.82 361064
weighted avg 0.88 0.88 0.88 361064
F1 Score for Positive: 0.8469135115586837
F1 Score for Negative: 0.6873195691560315
Accuracy: 0.8808216825825892
Plotting the ROC curve on training data
# Re-score the training set to get per-class probability estimates.
y_scores_proba_train = model.predict(X_train)
# One-vs-rest ROC curve and AUC for every class on the training split.
fpr_train, tpr_train, roc_auc_train = dict(), dict(), dict()
for cls in range(num_classes):
    fpr, tpr, _ = roc_curve(y_train[:, cls], y_scores_proba_train[:, cls])
    fpr_train[cls], tpr_train[cls] = fpr, tpr
    roc_auc_train[cls] = auc(fpr, tpr)
# Draw each class curve plus the chance diagonal.
plt.figure(figsize=(8, 6))
for cls in range(num_classes):
    plt.plot(fpr_train[cls], tpr_train[cls], label=f'Class {cls} (AUC = {roc_auc_train[cls]:0.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Model (Training Set)')
plt.legend(loc="lower right")
plt.show()
11284/11284 [==============================] - 1310s 116ms/step
Testing Data Check
# Generate predictions for testing data (per-class probabilities).
y_test_pred = model.predict(X_test)
y_test_pred_classes = np.argmax(y_test_pred, axis=1)
# One-vs-rest AUC on two one-hot columns.
# NOTE(review): under the 0-4 encoding used for training, column 1 is
# Strong_Neg and column 0 is Mild_Neg — the "Positive"/"Negative" print
# labels below look inaccurate; confirm which classes were intended.
auc_positive = roc_auc_score(y_test[:, 1], y_test_pred[:, 1])
auc_negative = roc_auc_score(y_test[:, 0], y_test_pred[:, 0])
print("AUC for Positive Class:", auc_positive)
print("AUC for Negative Class:", auc_negative)
# Construct confusion matrix for testing data (rows: true, cols: predicted).
cm_test = confusion_matrix(np.argmax(y_test, axis=1), y_test_pred_classes)
print("Confusion Matrix for Testing Data:")
print(cm_test)
# Classification Report for testing data (per-class precision/recall/F1).
print("Classification Report for Testing Data:")
print(classification_report(np.argmax(y_test, axis=1), y_test_pred_classes))
# Per-class F1: average=None returns one score per class, indexed by class.
# (Same class-index caveat as the AUC prints above.)
f1_score_positive = f1_score(np.argmax(y_test, axis=1), y_test_pred_classes, average=None)[1]
f1_score_negative = f1_score(np.argmax(y_test, axis=1), y_test_pred_classes, average=None)[0]
print("F1 Score for Positive:", f1_score_positive)
print("F1 Score for Negative:", f1_score_negative)
# Overall test-set accuracy.
accuracy = accuracy_score(np.argmax(y_test, axis=1), y_test_pred_classes)
print("Accuracy:", accuracy)
2821/2821 [==============================] - 327s 116ms/step
AUC for Positive Class: 0.9801403752468922
AUC for Negative Class: 0.9496297936299771
Confusion Matrix for Testing Data:
[[ 3933 1195 306 1100 283]
[ 1045 6761 119 345 214]
[ 217 160 8362 1603 2508]
[ 917 405 1255 11701 1080]
[ 96 133 2312 783 43434]]
Classification Report for Testing Data:
precision recall f1-score support
0 0.63 0.58 0.60 6817
1 0.78 0.80 0.79 8484
2 0.68 0.65 0.66 12850
3 0.75 0.76 0.76 15358
4 0.91 0.93 0.92 46758
accuracy 0.82 90267
macro avg 0.75 0.74 0.75 90267
weighted avg 0.82 0.82 0.82 90267
F1 Score for Positive: 0.789006885284164
F1 Score for Negative: 0.603915547024952
Accuracy: 0.8219061229463702
Plotting the ROC curve on testing data
# Re-score the test set to get per-class probability estimates.
y_scores_proba_test = model.predict(X_test)
# One-vs-rest ROC curve and AUC for every class on the test split.
fpr_test, tpr_test, roc_auc_test = dict(), dict(), dict()
for cls in range(num_classes):
    fpr, tpr, _ = roc_curve(y_test[:, cls], y_scores_proba_test[:, cls])
    fpr_test[cls], tpr_test[cls] = fpr, tpr
    roc_auc_test[cls] = auc(fpr, tpr)
# Draw each class curve plus the chance diagonal.
plt.figure(figsize=(8, 6))
for cls in range(num_classes):
    plt.plot(fpr_test[cls], tpr_test[cls], label=f'Class {cls} (AUC = {roc_auc_test[cls]:0.2f})')
plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Model (Testing Set)')
plt.legend(loc="lower right")
plt.show()
2821/2821 [==============================] - 322s 114ms/step
top 2 features with the highest weights
# Get the trained embedding matrix: shape (num_words, embedding_dim).
embedding_weights = model.layers[0].get_weights()[0]
# L2 norm of each word's embedding vector, used as an importance proxy.
embedding_norms = np.linalg.norm(embedding_weights, axis=1)
# Indices of the two largest-norm embeddings (argsort is ascending).
top_features_indices = np.argsort(embedding_norms)[-2:]
# BUG FIX: the original rebuilt two full vocabulary lists per lookup
# (O(V) each) and would raise ValueError for index 0 (the padding row,
# which has no word). Use the tokenizer's precomputed inverse mapping.
index_word = tokenizer.index_word  # inverse of tokenizer.word_index
print("Top 2 Features with the Highest Weights:")
for index in top_features_indices:
    word = index_word.get(int(index), '<PAD>')
    weight = embedding_norms[index]
    print(f"Feature: {word}, Weight: {weight:.4f}")
Top 2 Features with the Highest Weights: Feature: perfectly, Weight: 1.7785 Feature: but, Weight: 2.0063
# Define the number of data points to keep for each sentiment class.
num_data_per_sentiment = 5000
# Keep the FIRST `num_data_per_sentiment` rows of each class, preserving
# corpus order — the same selection the original made, but counted with a
# dict instead of list.count(), which was O(n) per row (O(n^2) overall).
X_filtered = []
y_filtered = []
class_counts = {}
target_total = num_data_per_sentiment * len(sentiment_mapping)
for sentiment, data in zip(y, X_padded):
    sentiment_label = sentiment_mapping[sentiment]
    if class_counts.get(sentiment_label, 0) < num_data_per_sentiment:
        X_filtered.append(data)
        y_filtered.append(sentiment_label)
        class_counts[sentiment_label] = class_counts.get(sentiment_label, 0) + 1
        if len(y_filtered) == target_total:
            break  # every class is full; no need to scan the rest
# Convert the filtered data to numpy arrays.
X_filtered = np.array(X_filtered)
y_filtered = np.array(y_filtered)
# One-hot encode the integer labels for categorical_crossentropy.
y_categorical_filtered = to_categorical(y_filtered)
# Train-test split on the filtered dataset (same seed as earlier splits).
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered, y_categorical_filtered, test_size=0.2, random_state=42)
# Print the shapes of the filtered datasets.
print("Shapes of Filtered Data:")
print("X_train:", X_train.shape)
print("X_test:", X_test.shape)
print("y_train:", y_train.shape)
print("y_test:", y_test.shape)
# Candidate hyperparameter settings; one model is trained per setting.
hyperparameters = [
    {'units': 64, 'dropout_rate': 0.3},
    {'units': 128, 'dropout_rate': 0.5},
    {'units': 256, 'dropout_rate': 0.7}
]
# List to store validation accuracies for each iteration.
validation_accuracies = []
# Ensure all indices in X_train and X_test are within [0, num_words - 1].
# (Defensive clip — presumably a no-op when the tokenizer that produced
# these sequences matches num_words; verify against the pipeline above.)
X_train_clipped = np.clip(X_train, 0, num_words - 1)
X_test_clipped = np.clip(X_test, 0, num_words - 1)
# Perform hyperparameter tuning for each iteration.
for i, params in enumerate(hyperparameters, start=1):
    print(f"Iteration {i}: Hyperparameters - {params}")
    # Define model architecture with the current hyperparameters.
    model = Sequential()
    model.add(Embedding(input_dim=num_words, output_dim=embedding_dim))
    model.add(Bidirectional(LSTM(units=params['units'], return_sequences=True)))
    model.add(Dropout(params['dropout_rate']))
    # Second BiLSTM uses half the units of the first.
    model.add(Bidirectional(LSTM(units=int(params['units'] / 2))))
    model.add(Dropout(params['dropout_rate']))
    model.add(Dense(units=num_classes, activation='softmax'))
    # Compile the model.
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    # Train for a single epoch; 20% of the data is held out for validation.
    history = model.fit(X_train_clipped, y_train, epochs=1, batch_size=128, validation_split=0.2, verbose=0)
    # Get validation accuracy from the (single-epoch) training history.
    validation_accuracy = history.history['val_accuracy'][0]
    validation_accuracies.append(validation_accuracy)
    print(f"Validation Accuracy for Iteration {i}: {validation_accuracy}")
# Print the validation accuracies for each iteration.
for i, acc in enumerate(validation_accuracies, start=1):
    print(f"Iteration {i} Validation Accuracy: {acc}")
Shapes of Filtered Data:
X_train: (20000, 83)
X_test: (5000, 83)
y_train: (20000, 5)
y_test: (5000, 5)
Iteration 1: Hyperparameters - {'units': 64, 'dropout_rate': 0.3}
Validation Accuracy for Iteration 1: 0.46799999475479126
Iteration 2: Hyperparameters - {'units': 128, 'dropout_rate': 0.5}
Validation Accuracy for Iteration 2: 0.484250009059906
Iteration 3: Hyperparameters - {'units': 256, 'dropout_rate': 0.7}
Validation Accuracy for Iteration 3: 0.49549999833106995
Iteration 1 Validation Accuracy: 0.46799999475479126
Iteration 2 Validation Accuracy: 0.484250009059906
Iteration 3 Validation Accuracy: 0.49549999833106995
3-fold cross-validation on the balanced sample (5000 data points per class)
# Define KFold cross-validation (3 folds, shuffled, fixed seed).
kf = KFold(n_splits=3, shuffle=True, random_state=42)
# Per-fold validation accuracies.
cv_scores = []
# Integer labels for scoring (argmax over the one-hot rows).
y_train_labels = np.argmax(y_train, axis=1)
# BUG FIX: the original ignored train_index/val_index entirely — every
# fold trained AND scored on the full training set, so the printed
# "cross-validation scores" were training accuracies. Each fold now
# trains on its train split and is scored on its held-out split.
for train_index, val_index in kf.split(X_train_clipped):
    X_fold_train = X_train_clipped[train_index]
    X_fold_val = X_train_clipped[val_index]
    y_fold_train = y_train[train_index]
    # Fresh model per fold so folds do not share weights.
    model = Sequential()
    model.add(Embedding(input_dim=num_words, output_dim=embedding_dim))
    model.add(Bidirectional(LSTM(units=128, return_sequences=True)))
    model.add(Dropout(0.5))
    model.add(Bidirectional(LSTM(units=64)))
    model.add(Dropout(0.5))
    model.add(Dense(units=num_classes, activation='softmax'))
    # Compile the model.
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    # Fit on this fold's training split only.
    model.fit(X_fold_train, y_fold_train, epochs=1, batch_size=128, verbose=0)
    # Score on the held-out validation split.
    val_predictions = model.predict(X_fold_val)
    val_pred_classes = np.argmax(val_predictions, axis=1)
    val_accuracy = accuracy_score(y_train_labels[val_index], val_pred_classes)
    cv_scores.append(val_accuracy)
# Convert list to numpy array.
cv_scores = np.array(cv_scores)
# Print cross-validation scores.
print("Cross-validation Scores:", cv_scores)
# Calculate and print mean cross-validation accuracy.
mean_cv_accuracy = np.mean(cv_scores)
print("Mean CV Accuracy:", mean_cv_accuracy)
625/625 [==============================] - 73s 114ms/step 625/625 [==============================] - 79s 120ms/step 625/625 [==============================] - 79s 124ms/step Cross-validation Scores: [0.6047 0.59495 0.59995] Mean CV Accuracy: 0.5998666666666667
pip install lime
Collecting lime
Downloading lime-0.2.0.1.tar.gz (275 kB)
āāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāāā 275.7/275.7 kB 4.9 MB/s eta 0:00:00
Preparing metadata (setup.py) ... done
Requirement already satisfied: matplotlib in /usr/local/lib/python3.10/dist-packages (from lime) (3.7.1)
Requirement already satisfied: numpy in /usr/local/lib/python3.10/dist-packages (from lime) (1.25.2)
Requirement already satisfied: scipy in /usr/local/lib/python3.10/dist-packages (from lime) (1.11.4)
Requirement already satisfied: tqdm in /usr/local/lib/python3.10/dist-packages (from lime) (4.66.2)
Requirement already satisfied: scikit-learn>=0.18 in /usr/local/lib/python3.10/dist-packages (from lime) (1.2.2)
Requirement already satisfied: scikit-image>=0.12 in /usr/local/lib/python3.10/dist-packages (from lime) (0.19.3)
Requirement already satisfied: networkx>=2.2 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (3.2.1)
Requirement already satisfied: pillow!=7.1.0,!=7.1.1,!=8.3.0,>=6.1.0 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (9.4.0)
Requirement already satisfied: imageio>=2.4.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (2.31.6)
Requirement already satisfied: tifffile>=2019.7.26 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (2024.2.12)
Requirement already satisfied: PyWavelets>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (1.5.0)
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from scikit-image>=0.12->lime) (24.0)
Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.18->lime) (1.3.2)
Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.18->lime) (3.3.0)
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (1.2.0)
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (0.12.1)
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (4.49.0)
Requirement already satisfied: kiwisolver>=1.0.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (1.4.5)
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (3.1.2)
Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.10/dist-packages (from matplotlib->lime) (2.8.2)
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.10/dist-packages (from python-dateutil>=2.7->matplotlib->lime) (1.16.0)
Building wheels for collected packages: lime
Building wheel for lime (setup.py) ... done
Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283835 sha256=d2d22c4dd7d3b5dfc9787b2e31bc5ec08502bfe8a560cbaa74d83cb807834c06
Stored in directory: /root/.cache/pip/wheels/fd/a2/af/9ac0a1a85a27f314a06b39e1f492bee1547d52549a4606ed89
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1
Model Interpretability using LIME
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from lime.lime_text import LimeTextExplainer
# Take a subset of 5000 data samples for the LIME analysis.
X_subset = X[:5000]
y_subset = y[:5000]
# Re-tokenize on the subset only.
# NOTE(review): this REFITS the tokenizer on the subset, giving a
# different vocabulary than the one `model` was trained with — word
# indices fed through predict_proba may not line up with the trained
# embedding. Also, `model` at this point is the last CV-fold model, not
# the full model trained earlier. Confirm both are intended.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_subset)
X_sequences = tokenizer.texts_to_sequences(X_subset)
# Pad sequences to the subset's longest tokenized tweet.
max_sequence_length = max([len(seq) for seq in X_sequences])
X_padded = pad_sequences(X_sequences, maxlen=max_sequence_length)
# Define a dictionary mapping sentiment label -> numeric class index.
sentiment_mapping = {'Strong_Pos': 4, 'Neutral': 3, 'Mild_Pos': 2, 'Strong_Neg': 1, 'Mild_Neg': 0}
y_categorical = [sentiment_mapping[sentiment] for sentiment in y_subset]
y_categorical = to_categorical(y_categorical)
# Define vocabulary size (number of unique words).
num_words = len(tokenizer.word_index) + 1 # Add 1 for the padding token
# Define number of classes.
num_classes = len(sentiment_mapping)
# Create the LIME text explainer with human-readable class names.
explainer = LimeTextExplainer(class_names=list(sentiment_mapping.keys()))
# Choose a sample for explanation (you can change the sample_index).
# NOTE(review): X_subset is a pandas Series, so [] is label-based —
# works while the index is 0..N-1; consider .iloc for safety.
sample_index = 0
sample_text = X_subset[sample_index]
true_sentiment = y_categorical[sample_index]
def predict_proba(texts):
    """Return the model's class-probability matrix for raw text inputs.

    Encodes the texts with the fitted tokenizer, pads them to the current
    sequence length, and runs the trained Keras model.
    """
    encoded = tokenizer.texts_to_sequences(texts)
    batch = pad_sequences(encoded, maxlen=max_sequence_length)
    return model.predict(batch)
# Predict sentiment probabilities for the chosen sample.
predicted_probabilities = predict_proba([sample_text])[0]
# Inverse of sentiment_mapping: numeric class index -> label string.
sentiment_labels = {0: 'Mild_Neg', 1: 'Strong_Neg', 2: 'Mild_Pos', 3: 'Neutral', 4: 'Strong_Pos'}
# Get the label of the predicted sentiment (argmax over the 5 classes).
predicted_sentiment_label = sentiment_labels[np.argmax(predicted_probabilities)]
# Generate the local explanation: the 10 words LIME weights most heavily
# for/against the predicted classes on this one sample.
explanation = explainer.explain_instance(sample_text, predict_proba, num_features=10)
# Print predicted sentiment and render the local explanation inline.
#print("True Sentiment:", true_sentiment_label)
print("Predicted Sentiment:", predicted_sentiment_label)
explanation.show_in_notebook(text=True)
1/1 [==============================] - 0s 144ms/step 157/157 [==============================] - 21s 132ms/step Predicted Sentiment: Mild_Pos
# Iterate over each sentiment category and show LIME explanations for the
# first two samples found in each class.
for label in sentiment_mapping:
    print("Sentiment:", label)
    # Positions of subset samples whose (string) label equals this class.
    indices = np.where(y_subset == label)[0]
    if len(indices) > 0:
        # Print explanations for the first 2 samples of this class.
        count = 0
        for sample_index in indices:
            # NOTE(review): sample_index is positional (from np.where) but
            # Series[] is label-based — works while the index is 0..N-1;
            # consider X_subset.iloc[sample_index].
            sample_text = X_subset[sample_index]
            true_sentiment = y_categorical[sample_index]
            # Generate the local explanation for this sample.
            explanation = explainer.explain_instance(sample_text, predict_proba, num_features=10)
            # Print the sample text.
            print("Sample Text:", sample_text)
            # Render the explanation inline.
            explanation.show_in_notebook(text=True)
            count += 1
            if count == 2:
                break
    else:
        print("No samples found for this sentiment category.")
Sentiment: Strong_Pos 157/157 [==============================] - 14s 87ms/step Sample Text: Freedom Convoy as InkBlot Test https://t.co/auLrduDpdI
157/157 [==============================] - 16s 101ms/step Sample Text: @mark_slapinski Well itĆ¢ā¬ā¢s pretty easy to see what their agenda is and Pierre has remained silent on the issues and he never actually fought for the convoy just did a photo op
Sentiment: Neutral 157/157 [==============================] - 20s 126ms/step Sample Text: @JustinTrudeau You Belong In Jail. #VaccineMandates #CrimesAgainstHumanity #TrudeauDictatorship #FreedomConvoy https://t.co/HrsYk2IYXC
157/157 [==============================] - 20s 129ms/step Sample Text: #FreeDumbConvoy #FreedomConvoy #Freedumbers #freedumb #freedom
Sentiment: Mild_Pos 157/157 [==============================] - 23s 144ms/step Sample Text: @_angelica_toy Happy Anniversary!!!....The Day the FreeDUMB Died (In the tune of Don McLean's "American Pie") #FreeDumbConvoy #Freedumbers #FluTruxKlan #convoywatch #convoy #FreedomConvoy https://t.co/ZT1cIPwmh9
157/157 [==============================] - 24s 156ms/step Sample Text: @McfarlaneGlenda Happy Anniversary!!!....The Day the FreeDUMB Died (In the tune of Don McLean's "American Pie") #FreeDumbConvoy #Freedumbers #FluTruxKlan #convoywatch #convoy #FreedomConvoy https://t.co/ZT1cIPwmh9
Sentiment: Strong_Neg 157/157 [==============================] - 25s 160ms/step Sample Text: @brethordark The #FreedomConvoy 1 year Anniversary... they don't like FREEDOM or HONKING!!!
157/157 [==============================] - 25s 158ms/step Sample Text: #Freedumbers partied as they caused fellow Canadians to be prisoners in their own homes When the pandemic came, their inconvenience was much more important, any lives lost be damned But #TrudeauIsAPsychopath? ðŸ¤ā Nah #FreeDumbConvoy #cdnpoli #FreedomConvoy #TrudeauWasRight
Sentiment: Mild_Neg 157/157 [==============================] - 25s 158ms/step Sample Text: @FightHaven Those knee drops remind me of something... Oh right. Trudeau's crackdown on the #FreedomConvoy Trucker's protest.
157/157 [==============================] - 24s 155ms/step Sample Text: @AndreLemelin4 We are a sovereign nation with a democratically elected government - planting another countryĆ¢ā¬ā¢s flag in front of our federal parliament (particularly in relation to this Ć¢ā¬Åfreedom convoyĆ¢ā¬ļæ½) strongly implies treasonous intent!
# Quote the notebook filename: unquoted spaces made nbconvert treat each
# word as a separate file pattern ("pattern 'Copy' matched no files").
!jupyter nbconvert --to html "Copy of NLP_3_Q-3.ipynb"
[NbConvertApp] WARNING | pattern 'Copy' matched no files
[NbConvertApp] WARNING | pattern 'of' matched no files
[NbConvertApp] WARNING | pattern 'NLP_3_Q-3.ipynb' matched no files
This application is used to convert notebook files (*.ipynb)
to various other formats.
WARNING: THE COMMANDLINE INTERFACE MAY CHANGE IN FUTURE RELEASES.
Options
=======
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
<cmd> --help-all
--debug
set log level to logging.DEBUG (maximize logging output)
Equivalent to: [--Application.log_level=10]
--show-config
Show the application's configuration (human-readable format)
Equivalent to: [--Application.show_config=True]
--show-config-json
Show the application's configuration (json format)
Equivalent to: [--Application.show_config_json=True]
--generate-config
generate default config file
Equivalent to: [--JupyterApp.generate_config=True]
-y
Answer yes to any questions instead of prompting.
Equivalent to: [--JupyterApp.answer_yes=True]
--execute
Execute the notebook prior to export.
Equivalent to: [--ExecutePreprocessor.enabled=True]
--allow-errors
Continue notebook execution even if one of the cells throws an error and include the error message in the cell output (the default behaviour is to abort conversion). This flag is only relevant if '--execute' was specified, too.
Equivalent to: [--ExecutePreprocessor.allow_errors=True]
--stdin
read a single notebook file from stdin. Write the resulting notebook with default basename 'notebook.*'
Equivalent to: [--NbConvertApp.from_stdin=True]
--stdout
Write notebook output to stdout instead of files.
Equivalent to: [--NbConvertApp.writer_class=StdoutWriter]
--inplace
Run nbconvert in place, overwriting the existing notebook (only
relevant when converting to notebook format)
Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory=]
--clear-output
Clear output of current file and save in place,
overwriting the existing notebook.
Equivalent to: [--NbConvertApp.use_output_suffix=False --NbConvertApp.export_format=notebook --FilesWriter.build_directory= --ClearOutputPreprocessor.enabled=True]
--no-prompt
Exclude input and output prompts from converted document.
Equivalent to: [--TemplateExporter.exclude_input_prompt=True --TemplateExporter.exclude_output_prompt=True]
--no-input
Exclude input cells and output prompts from converted document.
This mode is ideal for generating code-free reports.
Equivalent to: [--TemplateExporter.exclude_output_prompt=True --TemplateExporter.exclude_input=True --TemplateExporter.exclude_input_prompt=True]
--allow-chromium-download
Whether to allow downloading chromium if no suitable version is found on the system.
Equivalent to: [--WebPDFExporter.allow_chromium_download=True]
--disable-chromium-sandbox
Disable chromium security sandbox when converting to PDF..
Equivalent to: [--WebPDFExporter.disable_sandbox=True]
--show-input
Shows code input. This flag is only useful for dejavu users.
Equivalent to: [--TemplateExporter.exclude_input=False]
--embed-images
Embed the images as base64 dataurls in the output. This flag is only useful for the HTML/WebPDF/Slides exports.
Equivalent to: [--HTMLExporter.embed_images=True]
--sanitize-html
Whether the HTML in Markdown cells and cell outputs should be sanitized..
Equivalent to: [--HTMLExporter.sanitize_html=True]
--log-level=<Enum>
Set the log level by value or name.
Choices: any of [0, 10, 20, 30, 40, 50, 'DEBUG', 'INFO', 'WARN', 'ERROR', 'CRITICAL']
Default: 30
Equivalent to: [--Application.log_level]
--config=<Unicode>
Full path of a config file.
Default: ''
Equivalent to: [--JupyterApp.config_file]
--to=<Unicode>
The export format to be used, either one of the built-in formats
['asciidoc', 'custom', 'html', 'latex', 'markdown', 'notebook', 'pdf', 'python', 'rst', 'script', 'slides', 'webpdf']
or a dotted object name that represents the import path for an
``Exporter`` class
Default: ''
Equivalent to: [--NbConvertApp.export_format]
--template=<Unicode>
Name of the template to use
Default: ''
Equivalent to: [--TemplateExporter.template_name]
--template-file=<Unicode>
Name of the template file to use
Default: None
Equivalent to: [--TemplateExporter.template_file]
--theme=<Unicode>
Template specific theme(e.g. the name of a JupyterLab CSS theme distributed
as prebuilt extension for the lab template)
Default: 'light'
Equivalent to: [--HTMLExporter.theme]
--sanitize_html=<Bool>
Whether the HTML in Markdown cells and cell outputs should be sanitized.This
should be set to True by nbviewer or similar tools.
Default: False
Equivalent to: [--HTMLExporter.sanitize_html]
--writer=<DottedObjectName>
Writer class used to write the
results of the conversion
Default: 'FilesWriter'
Equivalent to: [--NbConvertApp.writer_class]
--post=<DottedOrNone>
PostProcessor class used to write the
results of the conversion
Default: ''
Equivalent to: [--NbConvertApp.postprocessor_class]
--output=<Unicode>
overwrite base name use for output files.
can only be used when converting one notebook at a time.
Default: ''
Equivalent to: [--NbConvertApp.output_base]
--output-dir=<Unicode>
Directory to write output(s) to. Defaults
to output to the directory of each notebook. To recover
previous default behaviour (outputting to the current
working directory) use . as the flag value.
Default: ''
Equivalent to: [--FilesWriter.build_directory]
--reveal-prefix=<Unicode>
The URL prefix for reveal.js (version 3.x).
This defaults to the reveal CDN, but can be any url pointing to a copy
of reveal.js.
For speaker notes to work, this must be a relative path to a local
copy of reveal.js: e.g., "reveal.js".
If a relative path is given, it must be a subdirectory of the
current directory (from which the server is run).
See the usage documentation
(https://nbconvert.readthedocs.io/en/latest/usage.html#reveal-js-html-slideshow)
for more details.
Default: ''
Equivalent to: [--SlidesExporter.reveal_url_prefix]
--nbformat=<Enum>
The nbformat version to write.
Use this to downgrade notebooks.
Choices: any of [1, 2, 3, 4]
Default: 4
Equivalent to: [--NotebookExporter.nbformat_version]
Examples
--------
The simplest way to use nbconvert is
> jupyter nbconvert mynotebook.ipynb --to html
Options include ['asciidoc', 'custom', 'html', 'latex', 'markdown', 'notebook', 'pdf', 'python', 'rst', 'script', 'slides', 'webpdf'].
> jupyter nbconvert --to latex mynotebook.ipynb
Both HTML and LaTeX support multiple output templates. LaTeX includes
'base', 'article' and 'report'. HTML includes 'basic', 'lab' and
'classic'. You can specify the flavor of the format used.
> jupyter nbconvert --to html --template lab mynotebook.ipynb
You can also pipe the output to stdout, rather than a file
> jupyter nbconvert mynotebook.ipynb --stdout
PDF is generated via latex
> jupyter nbconvert mynotebook.ipynb --to pdf
You can get (and serve) a Reveal.js-powered slideshow
> jupyter nbconvert myslides.ipynb --to slides --post serve
Multiple notebooks can be given at the command line in a couple of
different ways:
> jupyter nbconvert notebook*.ipynb
> jupyter nbconvert notebook1.ipynb notebook2.ipynb
or you can specify the notebooks list in a config file, containing::
c.NbConvertApp.notebooks = ["my_notebook.ipynb"]
> jupyter nbconvert --config mycfg.py
To see all available configurables, use `--help-all`.